#!pip install matplotlib seaborn mlxtend plotly
#basic libraries: numpy and pandas for data handling, pyplot
#and seaborn for visualization, math for mathematical operations
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
from datetime import date
from scipy.stats import chi2_contingency
from sklearn.preprocessing import MinMaxScaler
import scipy.stats as stats
import plotly.express as px
#dataset partition
from sklearn.model_selection import train_test_split
#feature selection methods
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import RFE
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
#scaling methods and categorical variable encoder
from sklearn.preprocessing import RobustScaler, OneHotEncoder
#model selection
from sklearn import model_selection
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
#linear models
from sklearn.linear_model import LogisticRegression, SGDClassifier
#gaussian naive bayes
from sklearn.naive_bayes import GaussianNB
#decision tree classifier
from sklearn.tree import DecisionTreeClassifier
#k-neighbors classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import KNNImputer
#principal component analysis
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')
#neural network
from sklearn.neural_network import MLPClassifier
#ensemble classifier models
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier, \
VotingClassifier, AdaBoostClassifier, StackingClassifier, HistGradientBoostingClassifier, \
ExtraTreesClassifier
#support vector machines
from sklearn.svm import SVC
#model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
make_scorer, classification_report, confusion_matrix, f1_score
from itertools import combinations
from collections import Counter
# Load the raw train/test splits and work on copies so the originals stay untouched.
train_data_original = pd.read_csv('train.csv')
test_data_original = pd.read_csv('test.csv')
df = train_data_original.copy()
df_test = test_data_original.copy()
# Quick structural overview: dtypes and non-null counts per column.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 15589 entries, 0 to 15588 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Cust_ID 15589 non-null int64 1 Churn 15589 non-null object 2 Name 15589 non-null object 3 Longevity 15589 non-null object 4 Year_Birth 15394 non-null float64 5 TypeTravel 15589 non-null object 6 RoomType 15589 non-null object 7 RewardPoints 15589 non-null int64 8 Comfort 15589 non-null int64 9 ReceptionSchedule 15589 non-null int64 10 FoodDrink 15589 non-null int64 11 Location 15589 non-null int64 12 Wifi 15589 non-null int64 13 Amenities 15589 non-null int64 14 Staff 15589 non-null int64 15 OnlineBooking 15589 non-null int64 16 PriceQuality 15589 non-null int64 17 RoomSpace 15589 non-null int64 18 CheckOut 15589 non-null int64 19 Checkin 15589 non-null int64 20 Cleanliness 15589 non-null int64 21 BarService 15589 non-null int64 dtypes: float64(1), int64(16), object(5) memory usage: 2.6+ MB
# Remember the original dimensions so later cleaning steps can be quantified.
df_original_row_size, df_original_columns_size = df.shape
print(f"Dataframe has {df_original_row_size} rows and {df_original_columns_size} columns")
Dataframe has 15589 rows and 22 columns
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Cust_ID | 15589.0 | 7795.000000 | 4500.301008 | 1.0 | 3898.0 | 7795.0 | 11692.0 | 15589.0 |
| Year_Birth | 15394.0 | 1981.706444 | 15.179042 | 1936.0 | 1970.0 | 1981.0 | 1994.0 | 2014.0 |
| RewardPoints | 15589.0 | 5022.593816 | 1027.962379 | 409.0 | 4445.0 | 5088.0 | 5649.0 | 6950.0 |
| Comfort | 15589.0 | 2.841619 | 1.388624 | 0.0 | 2.0 | 3.0 | 4.0 | 5.0 |
| ReceptionSchedule | 15589.0 | 2.997242 | 1.518994 | 0.0 | 2.0 | 3.0 | 4.0 | 5.0 |
| FoodDrink | 15589.0 | 2.844570 | 1.436948 | 0.0 | 2.0 | 3.0 | 4.0 | 5.0 |
| Location | 15589.0 | 2.986016 | 1.299438 | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 |
| Wifi | 15589.0 | 3.245109 | 1.327026 | 0.0 | 2.0 | 3.0 | 4.0 | 6.0 |
| Amenities | 15589.0 | 3.374816 | 1.352417 | 0.0 | 2.0 | 4.0 | 4.0 | 5.0 |
| Staff | 15589.0 | 3.506383 | 1.319565 | 1.0 | 3.0 | 4.0 | 5.0 | 5.0 |
| OnlineBooking | 15589.0 | 3.454231 | 1.310343 | 0.0 | 2.0 | 4.0 | 5.0 | 5.0 |
| PriceQuality | 15589.0 | 3.459683 | 1.268130 | 1.0 | 3.0 | 4.0 | 4.0 | 5.0 |
| RoomSpace | 15589.0 | 3.470845 | 1.293873 | 0.0 | 2.0 | 4.0 | 5.0 | 5.0 |
| CheckOut | 15589.0 | 3.700558 | 1.158644 | 1.0 | 3.0 | 4.0 | 5.0 | 5.0 |
| Checkin | 15589.0 | 3.327282 | 1.266872 | 1.0 | 3.0 | 3.0 | 4.0 | 5.0 |
| Cleanliness | 15589.0 | 3.692347 | 1.154437 | 1.0 | 3.0 | 4.0 | 5.0 | 5.0 |
| BarService | 15589.0 | 3.347360 | 1.300452 | 0.0 | 2.0 | 3.0 | 4.0 | 5.0 |
# Work on a throwaway copy for exploratory plots only.
check = df.copy()
# Derive a crude gender flag from the honorific prefix: 'Mr.' -> 1, anything else -> 0.
check['Gender'] = [1 if i == 'Mr.' else 0 for i in check["Name"].str[:3]]
# Count plots of the categorical predictors, split by the Churn target.
f, axes = plt.subplots(2,2, figsize=(20, 15), squeeze=False)
sns.countplot(x='RoomType', hue='Churn', data=check, color='darkseagreen', ax=axes[0, 0])
sns.countplot(x='TypeTravel', hue='Churn', data=check, color='tan', ax=axes[0, 1])
sns.countplot(x='Longevity', hue='Churn', data=check, color='cadetblue', ax=axes[1, 0])
sns.countplot(x='Gender', hue='Churn', data=check, color='dimgrey', ax=axes[1, 1])
<AxesSubplot:xlabel='Gender', ylabel='count'>
# Interactive box plot of each numeric/rating variable, split by Churn,
# rendered one figure at a time in the original order.
for survey_variable in ['RewardPoints', 'Year_Birth', 'Comfort',
                        'ReceptionSchedule', 'FoodDrink', 'Location', 'Wifi',
                        'Amenities', 'Staff', 'OnlineBooking', 'PriceQuality',
                        'RoomSpace', 'CheckOut', 'Checkin', 'Cleanliness',
                        'BarService']:
    fig = px.box(data_frame=check, x='Churn', y=survey_variable)
    fig.show()
df.skew().sort_values()
CheckOut -0.750689 Cleanliness -0.745131 Amenities -0.599498 Staff -0.554561 PriceQuality -0.503381 RoomSpace -0.482952 OnlineBooking -0.472074 RewardPoints -0.453779 Checkin -0.382588 BarService -0.358297 ReceptionSchedule -0.260705 Wifi -0.171255 FoodDrink -0.123610 Comfort -0.100907 Location -0.050229 Year_Birth -0.003847 Cust_ID 0.000000 dtype: float64
df.kurt().sort_values()
Cust_ID -1.200000 Wifi -1.111251 ReceptionSchedule -1.077243 Location -1.076583 FoodDrink -0.967047 BarService -0.949765 Comfort -0.935544 OnlineBooking -0.931811 RoomSpace -0.864644 Staff -0.858889 Checkin -0.812149 PriceQuality -0.772663 Year_Birth -0.729800 Amenities -0.540657 Cleanliness -0.225183 CheckOut -0.225016 RewardPoints 0.260135 dtype: float64
df.isna().sum()
Cust_ID 0 Churn 0 Name 0 Longevity 0 Year_Birth 195 TypeTravel 0 RoomType 0 RewardPoints 0 Comfort 0 ReceptionSchedule 0 FoodDrink 0 Location 0 Wifi 0 Amenities 0 Staff 0 OnlineBooking 0 PriceQuality 0 RoomSpace 0 CheckOut 0 Checkin 0 Cleanliness 0 BarService 0 dtype: int64
# Change the data frame index to the customer ID of each entry, rather than the standard index
df.set_index("Cust_ID", inplace = True)
df_test.set_index("Cust_ID", inplace = True)
# Target distribution: check the class balance before modelling.
df['Churn'].value_counts()
sns.countplot(df["Churn"], color="skyblue")
plt.show()
Churn is our target variable and it looks somewhat evenly distributed. We will still test with over- and under-sampled datasets, but there is not a huge discrepancy between the values of Churn.
df['Name'].value_counts().nlargest(1000)
Mr. Michael Smith 9
Ms. Amanda Smith 7
Mr. John Smith 7
Mr. Michael Jones 6
Mr. William Smith 6
..
Ms. Elizabeth Adams 2
Mr. Daniel Hall 2
Mr. William Wright 2
Ms. Sandra Lopez 2
Mr. Joseph Martinez 2
Name: Name, Length: 1000, dtype: int64
df['Longevity'].value_counts()
yes 12548 no 2874 y 167 Name: Longevity, dtype: int64
The variable does not look good because we have entries with 'yes' and entries with 'y', which most likely mean the same thing. We will transform the entries with 'y' into 'yes'.
# Normalise the inconsistent 'y' answers to 'yes' so Longevity has exactly two levels.
df['Longevity'].replace('y','yes', inplace=True)
df['Longevity'].value_counts()
sns.countplot(df["Longevity"], color="skyblue")
plt.show()
# Inspect birth-year frequencies (195 values are missing; imputed later).
df['Year_Birth'].value_counts()
1982.0 441
1996.0 416
1980.0 383
1998.0 380
1981.0 369
...
1947.0 9
1945.0 8
1943.0 6
1942.0 3
1936.0 2
Name: Year_Birth, Length: 75, dtype: int64
sns.histplot(df["Year_Birth"], color="skyblue")
<AxesSubplot:xlabel='Year_Birth', ylabel='Count'>
df['TypeTravel'].value_counts()
business 10756 leisure 4833 Name: TypeTravel, dtype: int64
sns.countplot(df["TypeTravel"], color="skyblue")
<AxesSubplot:xlabel='TypeTravel', ylabel='count'>
df['RoomType'].value_counts()
single 7442 double 7021 suite 1126 Name: RoomType, dtype: int64
sns.countplot(df["RoomType"], color="skyblue")
<AxesSubplot:xlabel='RoomType', ylabel='count'>
# Histogram of every rating variable on one 3x5 grid (last slot stays empty).
rating_columns = ['Comfort', 'ReceptionSchedule', 'FoodDrink', 'Location',
                  'Wifi', 'Amenities', 'Staff', 'OnlineBooking', 'PriceQuality',
                  'RoomSpace', 'CheckOut', 'Checkin', 'Cleanliness', 'BarService']
f, axes = plt.subplots(3,5, figsize=(20, 15), squeeze=False)
for position, column in enumerate(rating_columns):
    sns.histplot(df[column], color="skyblue", ax=axes[position // 5, position % 5])
plt.show()
Upon looking at the charts, we noticed that 'Wifi' has values that should not be there, in particular, some surveys were marked with 6 which is not an available rating.
Since there are only 36 rows with value 6, we will convert them to a 5.
# Wifi is rated on a 0-5 scale, but 36 surveys contain an out-of-range 6;
# clamp those to the maximum valid rating of 5.
df['Wifi'].replace(6,5, inplace=True)
sns.histplot(df["Wifi"], color="skyblue")
plt.show()
#create a boolean mask of fully duplicated rows
duplicates = df.duplicated()
#visualize the duplicates
df[duplicates]
| Churn | Name | Longevity | Year_Birth | TypeTravel | RoomType | RewardPoints | Comfort | ReceptionSchedule | FoodDrink | ... | Wifi | Amenities | Staff | OnlineBooking | PriceQuality | RoomSpace | CheckOut | Checkin | Cleanliness | BarService | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Cust_ID | |||||||||||||||||||||
| 8196 | nochurn | Ms. Abigail York | yes | 1995.0 | leisure | double | 5098 | 5 | 5 | 5 | ... | 4 | 5 | 5 | 3 | 3 | 4 | 3 | 3 | 3 | 5 |
| 9177 | churn | Ms. Abigail Kennedy | yes | 1991.0 | business | suite | 5932 | 3 | 3 | 2 | ... | 3 | 3 | 3 | 3 | 4 | 1 | 4 | 3 | 4 | 3 |
| 9418 | nochurn | Ms. Abigail Buchanan | yes | 1972.0 | business | double | 6769 | 5 | 4 | 4 | ... | 5 | 5 | 4 | 5 | 5 | 5 | 5 | 2 | 5 | 1 |
3 rows × 21 columns
# Remove the duplicated rows found above and confirm the new shape.
df.drop_duplicates(inplace = True)
df.shape
(15586, 21)
# Box plot of every numeric variable on one 4x4 grid, to spot outliers.
boxplot_columns = ['Year_Birth', 'RewardPoints', 'Comfort', 'ReceptionSchedule',
                   'FoodDrink', 'Location', 'Wifi', 'Amenities', 'Staff',
                   'OnlineBooking', 'PriceQuality', 'RoomSpace', 'CheckOut',
                   'Checkin', 'Cleanliness', 'BarService']
f, axes = plt.subplots(4,4, figsize=(20, 15), squeeze=False)
for position, column in enumerate(boxplot_columns):
    sns.boxplot(df[column], ax=axes[position // 4, position % 4])
plt.show()
#method to return the boundaries of IQR
def get_IQR_bounds(s):
q1 = s.quantile(0.25)
q3 = s.quantile(0.75)
iqr = q3 - q1
lower_bound = q1 -(1.5 * iqr)
upper_bound = q3 +(1.5 * iqr)
return (lower_bound,upper_bound)
# Drop rows whose RewardPoints fall below the lower IQR fence (low-end outliers).
# Count against the CURRENT row count: df_original_row_size would also include
# the 3 duplicated rows already removed above, overstating the outlier count.
rows_before_filter = df.shape[0]
df = df[df['RewardPoints'] > get_IQR_bounds(df['RewardPoints'])[0]]
print("There have been", rows_before_filter - df.shape[0], "outliers removed")
There are been 293 outliers removed
# Distribution of RewardPoints after trimming the low-end outliers.
sns.histplot(df["RewardPoints"], color="skyblue")
plt.show()
After some research and analysis on the subject, we decided not to remove outliers on the two rating variables that presented outliers in the boxplot visualization. Those are 'PriceQuality' and 'Checkin'. The reason we did not remove them was that there were too many entries that would have to be removed, and if we did, we would effectively be shortening the rating scale for the rating data.
Instead, we will look for straight-lining in the rating entries, meaning respondents who answered every question with the same value, which can mean they were in a rush and decided to fill in the survey as fast as they could.
# Respondents who gave the exact same answer to all 14 rating questions
# ("straight-lining"): one unique value across the rating columns.
rating_questions = ['Comfort', 'ReceptionSchedule', 'FoodDrink', 'Location',
                    'Wifi', 'Amenities', 'Staff', 'OnlineBooking',
                    'PriceQuality', 'RoomSpace', 'CheckOut', 'Checkin',
                    'Cleanliness', 'BarService']
straight_lining_entries = df[df[rating_questions].nunique(axis=1) == 1]
straight_lining_entries
| Churn | Name | Longevity | Year_Birth | TypeTravel | RoomType | RewardPoints | Comfort | ReceptionSchedule | FoodDrink | ... | Wifi | Amenities | Staff | OnlineBooking | PriceQuality | RoomSpace | CheckOut | Checkin | Cleanliness | BarService | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Cust_ID | |||||||||||||||||||||
| 1549 | nochurn | Mr. Marcus Burns | yes | 1981.0 | business | single | 5289 | 4 | 4 | 4 | ... | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 |
| 15260 | nochurn | Mr. Randy Robbins | yes | 1976.0 | business | single | 6699 | 5 | 5 | 5 | ... | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 |
2 rows × 21 columns
df.drop(straight_lining_entries.index, axis=0, inplace=True)
We found 2 entries that are considered straight lining answers and we will remove them from the data
#churn = 1, nochurn = 0
df['Churn'] = [1 if i == 'churn' else 0 for i in df["Churn"]]
# Separate predictors and target into independent copies.
df_data = df.drop(['Churn'], axis=1).copy()
df_target = df['Churn'].copy()
#Method to transform the data
def transform_data(X, age=False):
    """Impute and scale the numerical columns of X.

    Splits X into its numerical and categorical parts (via the module-level
    `num_vars` / `cat_vars` lists), KNN-imputes missing values (only
    Year_Birth has NaNs), optionally derives an 'Age' column, and min-max
    scales the numerical part.

    Requires: feature frame X; age=True adds the derived Age column.
    Returns: (scaled numerical DataFrame, categorical DataFrame).
    """
    # Copies avoid SettingWithCopy warnings when columns are assigned below.
    X_train_num = X[num_vars].copy()
    X_train_cat = X[cat_vars]
    # k for the KNN imputer: sqrt-of-sample-size heuristic.
    k_imputer = round(np.sqrt(len(X_train_num)), 0).astype('int32')
    imputer = KNNImputer(n_neighbors=k_imputer, weights="uniform", metric='nan_euclidean')
    data_KNN_train = pd.DataFrame(imputer.fit_transform(X_train_num),
                                  columns=X_train_num.columns,
                                  index=X_train_num.index)
    # Select the imputed column by NAME: num_vars is built from a set, so
    # positional index 0 is NOT guaranteed to be Year_Birth (the original
    # code's data_KNN_train[0] relied on that arbitrary ordering).
    X_train_num['Year_Birth'] = data_KNN_train['Year_Birth'].round(0).values
    if age:
        X_train_num['Age'] = date.today().year - X_train_num['Year_Birth']
    # Min-max scale the (now complete) numerical data.
    scaler = MinMaxScaler().fit(X_train_num)
    X_train_scaled = pd.DataFrame(scaler.transform(X_train_num),
                                  columns=X_train_num.columns,
                                  index=X_train_num.index)
    return X_train_scaled, X_train_cat
#Gender Variable
# Peek at the honorific prefixes to confirm which values occur.
df_data["Characters"] = df_data["Name"].str[:3]
df_data['Characters'].unique()
df_data.drop(columns=["Characters"], axis=1, inplace=True)
# Encode gender from the honorific ('Mr.' -> Male, otherwise Female),
# then drop the raw name column.
df_data['Gender'] = ['Male' if i == 'Mr.' else 'Female' for i in df_data["Name"].str[:3]]
df_data.drop(columns=['Name'], inplace=True)
# Variable groups consumed by transform_data() / select_best_features().
cat_vars = ['Gender', 'Longevity', 'TypeTravel', 'RoomType']
# NOTE(review): building num_vars from a set makes its order arbitrary;
# downstream code that indexes imputed columns positionally depends on it.
num_vars = list(set(df_data.columns) - set(cat_vars))
def select_best_features(X, y, splits, age=False):
    """Run stratified K-fold feature selection and print a summary table.

    Per fold: impute/scale via transform_data(), then count how often each
    numerical feature is picked by RFE (logistic regression, decision tree,
    random forest) and accumulate LassoCV coefficients; each categorical
    feature is chi-square tested against the target.

    Requires: feature frame X, target y, number of folds `splits`;
    age=True adds the derived 'Age' feature (and, as a side effect,
    appends 'Age' to the module-level num_vars list).
    """
    # One selection counter per numerical feature (16 of them).
    sel_log = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    sel_tree = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    sel_forest = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    sel_lasso = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    if age:
        # One extra slot for the derived 'Age' feature.
        sel_log.append(0)
        sel_tree.append(0)
        sel_forest.append(0)
        sel_lasso.append(0)
    skf = StratifiedKFold(n_splits = splits, shuffle=True, random_state=42)
    counter = 0
    for train_index, val_index in skf.split(X, y):
        counter +=1
        print('')
        print('--------------------------------------------------------')
        print('SPLIT ', counter)
        print('--------------------------------------------------------')
        print('')
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        # Impute + scale each fold independently (numeric part only).
        X_train_scaled, X_train_cat = transform_data(X_train, age)
        X_val_scaled, X_val_cat = transform_data(X_val, age)
        # Check which features to use using RFE and Logistic Regression
        print('')
        print('----------------- RFE ----------------------')
        model = LogisticRegression()
        rfe = RFE(estimator = model, n_features_to_select = 5)
        X_rfe = rfe.fit_transform(X = X_train_scaled, y = y_train)
        selected_features = pd.Series(rfe.support_, index = X_train_scaled.columns)
        sel_log=np.add(sel_log,list(map(int, selected_features)))
        # Check which features to use using RFE and decision tree
        model = DecisionTreeClassifier()
        rfe = RFE(estimator = model, n_features_to_select = 5)
        X_rfe = rfe.fit_transform(X = X_train_scaled, y = y_train)
        selected_features = pd.Series(rfe.support_, index = X_train_scaled.columns)
        sel_tree=np.add(sel_tree,list(map(int, selected_features)))
        # Check which features to use using RFE and random forest
        model = RandomForestClassifier()
        rfe = RFE(estimator = model, n_features_to_select = 5)
        X_rfe = rfe.fit_transform(X = X_train_scaled, y = y_train)
        selected_features = pd.Series(rfe.support_, index = X_train_scaled.columns)
        sel_forest=np.add(sel_forest,list(map(int, selected_features)))
        # Lasso: accumulate raw coefficients (averaged over the folds below).
        reg = LassoCV()
        # NOTE(review): these .replace() calls are no-ops when Churn was
        # already mapped to 0/1 before this function is called — confirm.
        reg.fit(X=X_train_scaled, y=y_train.replace('nochurn',0).replace('churn',1))
        print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
        print("Best score using built-in LassoCV: %f" %reg.score(X = X_train_scaled,y = y_train.replace('nochurn',0).replace('churn',1)))
        coef = pd.Series(reg.coef_, index = X_train_scaled.columns)
        sel_lasso=np.add(sel_lasso,coef.values)
        # Check which features to use using Chi-Square
        print('')
        print('----------------- CHI-SQUARE ----------------------')
        def TestIndependence(X,y,var,alpha=0.05):
            # Chi-square independence test between one categorical feature
            # and the target; prints whether the feature looks predictive.
            dfObserved = pd.crosstab(y,X)
            chi2, p, dof, expected = stats.chi2_contingency(dfObserved.values)
            dfExpected = pd.DataFrame(expected, columns=dfObserved.columns, index = dfObserved.index)
            if p<alpha:
                result="{0} is IMPORTANT for Prediction".format(var)
            else:
                result="{0} is NOT important for Prediction. (Discard {0} from model)".format(var)
            print(result)
        for var in X_train_cat:
            TestIndependence(X_train_cat[var],y_train, var)
    # Average the accumulated Lasso coefficients over the folds.
    sel_lasso = sel_lasso/splits
    if age==True:
        # Side effect: extends the module-level num_vars list.
        num_vars.append("Age")
    # NOTE(review): passing a set as `columns` yields an arbitrary label order;
    # it happens to match list(set(...)) within one interpreter run, but
    # labelling by X_train_scaled.columns would be safer — verify.
    final = pd.DataFrame(np.array([sel_log,sel_tree,sel_forest,sel_lasso]),\
                         columns=set(num_vars),index=['Logistic Regression','Decision Tree','Random Forest','Lasso'])
    print(final.T)
select_best_features(df_data, df_target, 5)
--------------------------------------------------------
SPLIT 1
--------------------------------------------------------
----------------- RFE ----------------------
Best alpha using built-in LassoCV: 0.000070
Best score using built-in LassoCV: 0.408180
----------------- CHI-SQUARE ----------------------
Gender is IMPORTANT for Prediction
Longevity is IMPORTANT for Prediction
TypeTravel is IMPORTANT for Prediction
RoomType is IMPORTANT for Prediction
--------------------------------------------------------
SPLIT 2
--------------------------------------------------------
----------------- RFE ----------------------
Best alpha using built-in LassoCV: 0.000080
Best score using built-in LassoCV: 0.411028
----------------- CHI-SQUARE ----------------------
Gender is IMPORTANT for Prediction
Longevity is IMPORTANT for Prediction
TypeTravel is IMPORTANT for Prediction
RoomType is IMPORTANT for Prediction
--------------------------------------------------------
SPLIT 3
--------------------------------------------------------
----------------- RFE ----------------------
Best alpha using built-in LassoCV: 0.000113
Best score using built-in LassoCV: 0.403751
----------------- CHI-SQUARE ----------------------
Gender is IMPORTANT for Prediction
Longevity is IMPORTANT for Prediction
TypeTravel is IMPORTANT for Prediction
RoomType is IMPORTANT for Prediction
--------------------------------------------------------
SPLIT 4
--------------------------------------------------------
----------------- RFE ----------------------
Best alpha using built-in LassoCV: 0.000185
Best score using built-in LassoCV: 0.410113
----------------- CHI-SQUARE ----------------------
Gender is IMPORTANT for Prediction
Longevity is IMPORTANT for Prediction
TypeTravel is IMPORTANT for Prediction
RoomType is IMPORTANT for Prediction
--------------------------------------------------------
SPLIT 5
--------------------------------------------------------
----------------- RFE ----------------------
Best alpha using built-in LassoCV: 0.000158
Best score using built-in LassoCV: 0.405094
----------------- CHI-SQUARE ----------------------
Gender is IMPORTANT for Prediction
Longevity is IMPORTANT for Prediction
TypeTravel is IMPORTANT for Prediction
RoomType is IMPORTANT for Prediction
Logistic Regression Decision Tree Random Forest Lasso
Year_Birth 0.0 5.0 5.0 0.020899
Location 0.0 1.0 0.0 -0.015053
ReceptionSchedule 5.0 5.0 5.0 -0.165488
Checkin 4.0 0.0 0.0 -0.038071
Amenities 1.0 0.0 0.0 -0.029079
CheckOut 5.0 4.0 5.0 -0.067565
BarService 0.0 5.0 5.0 -0.005781
Cleanliness 5.0 0.0 0.0 -0.049719
Staff 5.0 0.0 0.0 -0.057884
OnlineBooking 0.0 0.0 0.0 0.034264
RewardPoints 0.0 0.0 0.0 -0.000308
Wifi 0.0 0.0 0.0 -0.018957
FoodDrink 0.0 0.0 0.0 -0.000017
Comfort 0.0 0.0 0.0 0.015966
PriceQuality 0.0 0.0 0.0 0.028827
RoomSpace 0.0 5.0 5.0 -0.033102
def cor_heatmap(cor):
    """Display the given correlation matrix as an annotated heatmap."""
    plt.figure(figsize=(12, 10))
    # One-decimal annotations keep the cells readable.
    sns.heatmap(data=cor, annot=True, fmt='.1')
    plt.show()
cor_heatmap(df_data.corr(method = 'spearman'))
| Predictor | RFE Logistic | RFE Decision Tree | RFE Random Forest | Lasso | Correlation | Include in the model? |
|---|---|---|---|---|---|---|
| Year_Birth | Keep | Keep | Keep | Keep? | Keep | Keep |
| RewardPoints | Discard | Discard | Discard | Discard | Keep | Discard |
| Comfort | Discard | Discard | Discard | Discard | Discard | Discard |
| ReceptionSchedule | Keep? | Discard | Keep | Keep? | Keep | Discard |
| FoodDrink | Keep | Discard | Discard | Keep | Keep | Keep |
| Location | Keep | Discard | Discard | Keep | Keep | Keep |
| Wifi | Discard | Discard | Discard | Keep | Keep | Discard |
| Amenities | Discard | Keep? | Discard | Keep? | Keep | Keep? |
| Staff | Discard | Keep | Discard | Keep? | Discard | Discard |
| OnlineBooking | Discard | Keep | Keep | Discard | Discard | Discard |
| PriceQuality | Keep | Discard | Discard | Keep? | Keep | Discard |
| RoomSpace | Keep | Keep | Keep | Keep | Keep | Keep |
| CheckOut | Discard | Keep | Keep | Keep? | Keep | Keep? |
| CheckIn | Discard | Discard | Discard | Keep? | Keep | Discard |
| Cleanliness | Discard | Discard | Discard | Keep? | Keep | Discard |
| BarService | Discard | Discard | Discard | Discard | Discard | Discard |
# Final feature choices taken from the selection summary table above.
selected_cat = ['Longevity','TypeTravel','RoomType','Gender']
selected_num = ['Year_Birth', 'FoodDrink', 'Location', 'Amenities', 'RoomSpace', 'CheckOut']
features_to_drop = ['RewardPoints','Comfort','ReceptionSchedule','Wifi','Staff','OnlineBooking','PriceQuality',
                    'Checkin','Cleanliness','BarService']
df_data.drop(columns = features_to_drop, inplace = True, axis=1)
#yes = 1, no = 0
df_data['Longevity'] = [1 if i == 'yes' else 0 for i in df_data["Longevity"]]
#business = 1, leisure = 0
df_data['TypeTravel'] = [1 if i == 'business' else 0 for i in df_data["TypeTravel"]]
#Male = 1, Female = 0
df_data['Gender'] = [1 if i == 'Male' else 0 for i in df_data["Gender"]]
# One-hot encode the remaining categorical column (RoomType).
df_data = pd.get_dummies(df_data, drop_first = True)
# NOTE(review): the scaler is fit on ALL of df_data before any CV split,
# which leaks validation statistics into training — consider fitting per fold.
scaler = MinMaxScaler()
df_data = pd.DataFrame(scaler.fit_transform(df_data), index=df_data.index, columns = df_data.columns)
def compare_models(X, y, model):
    """5-fold stratified CV of `model`: impute Year_Birth per fold, fit, and
    return mean +/- std F1 for train and validation as two formatted strings.

    Requires: the module-level `selected_num` list; 'Year_Birth' must be its
    FIRST element because the imputed column is accessed positionally ([0]).
    """
    # apply StratifiedK-Fold
    skf = StratifiedKFold(n_splits = 5)
    score_train = []
    score_val = []
    for train_index, val_index in skf.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        # This time we are going to use validation to check overfitting
        # so we need also to make all the needed changes in the validation
        # fill missing values (mean in numerical data, mode in categorical data)
        #median_age_train = X_train['age'].median() # age is no longer used
        #X_train['age'].fillna(median_age_train, inplace = True)
        #X_val['age'].fillna(median_age_train, inplace = True)
        # k for the KNN imputer: sqrt-of-fold-size heuristic.
        k_imputer = round(np.sqrt(len(X_train[selected_num])),0).astype('int32') # 125
        imputer = KNNImputer(n_neighbors=k_imputer, weights="uniform", metric='nan_euclidean')
        # Fit the imputer on the training fold only, to avoid leakage.
        imputer.fit(X_train[selected_num])
        data_KNN_train = imputer.transform(X_train[selected_num])
        data_KNN_train = pd.DataFrame(data_KNN_train)
        # Column 0 is Year_Birth because it is listed first in selected_num.
        data_KNN_train[0] = data_KNN_train[0].round(0)
        X_train['Year_Birth'] = data_KNN_train[0].values
        # Use Train Imputer for Validation Data
        data_KNN_val = imputer.transform(X_val[selected_num])
        data_KNN_val = pd.DataFrame(data_KNN_val)
        data_KNN_val[0] = data_KNN_val[0].round(0)
        X_val['Year_Birth'] = data_KNN_val[0].values
        # Data Scaling
        # Apply MinMaxScaler
        #scaler = MinMaxScaler().fit(X_train[selected_num])
        #X_train_scaled = scaler.transform(X_train[selected_num])
        #X_val_scaled = scaler.transform(X_val[selected_num]) # Scaling with 'scaler' from train data
        # Apply model
        model.fit(X_train, y_train)
        predictions_train = model.predict(X_train)
        predictions_val = model.predict(X_val)
        score_train.append(f1_score(y_train, predictions_train))
        score_val.append(f1_score(y_val, predictions_val))
    # Summarise the per-fold F1 scores as "mean+/-std" strings.
    avg_train = round(np.mean(score_train),3)
    avg_val = round(np.mean(score_val),3)
    std_train = round(np.std(score_train),2)
    std_val = round(np.std(score_val),2)
    return str(avg_train) + '+/-' + str(std_train),str(avg_val) + '+/-' + str(std_val)
def show_results(df, X, y, *args):
    """
    Fill the given results dataframe — one row per model, in the order the
    models are passed — with the train/validation scores returned by
    compare_models, and return the completed dataframe.
    """
    for row, estimator in enumerate(args):
        # obtain the cross-validated scores for this model
        train_result, val_result = compare_models(X, y, estimator)
        # store them in the matching row of the results table
        df.iloc[row] = train_result, val_result
    return df
# One default-configuration instance per candidate algorithm family.
model_LR = LogisticRegression()
model_KNN = KNeighborsClassifier()
model_GB = GradientBoostingClassifier()
model_HGB = HistGradientBoostingClassifier()
model_AB = AdaBoostClassifier()
model_SVC = SVC()
model_G = GaussianNB()
model_ET = ExtraTreesClassifier()
model_RF = RandomForestClassifier()
model_DT = DecisionTreeClassifier()
model_MLP = MLPClassifier()
# Empty results table, filled row-by-row by show_results.
# NOTE(review): this rebinds `df` (previously the training dataframe)
# to the results table.
df = pd.DataFrame(columns = ['Train','Validation'], index =
                  ['Logistic Regression','KNN', 'GradientBoost',
                   'HistGradientBoost', 'AdaBoost',
                   'SVC', 'Gaussian', 'ExtraTrees',
                   'RandomForest', 'DecisonTree',
                   'MLPClassifier'])
# Cross-validate every model; argument order must match the index above.
show_results(df, df_data, df_target,
             model_LR, model_KNN, model_GB,
             model_HGB, model_AB, model_SVC,
             model_G, model_ET, model_RF,
             model_DT, model_MLP)
| Train | Validation | |
|---|---|---|
| Logistic Regression | 0.787+/-0.0 | 0.787+/-0.01 |
| KNN | 0.896+/-0.0 | 0.855+/-0.0 |
| GradientBoost | 0.87+/-0.0 | 0.866+/-0.0 |
| HistGradientBoost | 0.9+/-0.0 | 0.882+/-0.0 |
| AdaBoost | 0.826+/-0.0 | 0.826+/-0.01 |
| SVC | 0.875+/-0.0 | 0.869+/-0.0 |
| Gaussian | 0.756+/-0.0 | 0.756+/-0.01 |
| ExtraTrees | 0.951+/-0.0 | 0.86+/-0.0 |
| RandomForest | 0.952+/-0.0 | 0.868+/-0.0 |
| DecisonTree | 0.951+/-0.0 | 0.839+/-0.01 |
| MLPClassifier | 0.882+/-0.0 | 0.873+/-0.01 |
#First we must transform the test dataset to be able to be used in modeling
#drop variables
df_test.drop(columns=features_to_drop, inplace = True)
#feature engineering (same steps as applied to the training data)
df_test["Characters"] = df_test["Name"].str[:3]
df_test['Characters'].unique()
df_test.drop(columns=["Characters"], axis=1, inplace=True)
df_test['Gender'] = ['Male' if i == 'Mr.' else 'Female' for i in df_test["Name"].str[:3]]
df_test.drop(columns=['Name'], inplace=True)
#dummies
#yes = 1, no = 0
df_test['Longevity'] = [1 if i == 'yes' else 0 for i in df_test["Longevity"]]
#business = 1, leisure = 0
df_test['TypeTravel'] = [1 if i == 'business' else 0 for i in df_test["TypeTravel"]]
#Male = 1, Female = 0
df_test['Gender'] = [1 if i == 'Male' else 0 for i in df_test["Gender"]]
df_test = pd.get_dummies(df_test, drop_first = True)
#scaling: reuse the MinMaxScaler already FITTED on the training features
#(df_data) so train and test share the same scale. Refitting a fresh scaler
#on the test set (as the original code did) would feed the model inputs on a
#different scale than it was trained on.
#NOTE(review): this assumes df_test ends up with the same columns, in the
#same order, as df_data had at fit time — verify if the dummy categories
#differ between the two splits.
df_test = pd.DataFrame(scaler.transform(df_test), index=df_test.index, columns = df_test.columns)
def score(y_val, y_pred):
    '''Print the micro-averaged F1 score followed by the full
    classification report (precision, recall, f1-score and support
    per class, plus accuracy and the macro/weighted averages).

    Requires: the target from the validation dataset
    and the corresponding prediction.
    '''
    micro_f1 = f1_score(y_val, y_pred, average='micro')
    print('Micro f1 score:', micro_f1)
    print('\nResults on the data set:')
    print(classification_report(y_true = y_val, y_pred = y_pred))
from sklearn.impute import KNNImputer
# Impute the remaining Year_Birth NaNs on the (already scaled) feature matrix.
imputer = KNNImputer(n_neighbors=2)
imputer_matrix = imputer.fit_transform(df_data)
data_cleaned = pd.DataFrame(data=imputer_matrix, columns = df_data.columns, index = df_data.index)
df_data['Year_Birth'] = data_cleaned['Year_Birth']
# Hold out 30% of the data for validation.
X_train, X_val, y_train, y_val = train_test_split(df_data, df_target, test_size=0.3, random_state=123)
clf = GradientBoostingClassifier()
print('-------MODEL WITH THE SELECTED FEATURES-------')
# 3-fold CV F1 on the training split, then fit and score on the hold-out.
scores = model_selection.cross_val_score(clf, X_train, y_train, cv=3, scoring='f1')
print("Score on train: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std()))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
score(y_val,y_pred)
-------MODEL WITH THE SELECTED FEATURES-------
Score on train: 0.86945 (+/- 0.01)
Micro f1 score: 0.8842885160165613
Results on the data set:
precision recall f1-score support
0 0.89 0.90 0.89 2492
1 0.88 0.86 0.87 2097
accuracy 0.88 4589
macro avg 0.88 0.88 0.88 4589
weighted avg 0.88 0.88 0.88 4589
def search_cv_gb(x_train, y_train, x_val, y_val):
    '''Determines the optimal parameters for a gradient boosting classifier
    for the input data, and prints the optimal classifier, its best
    parameters, and the cross-validation F1 scores for both training and
    validation data.

    (The original docstring said "random forest classifier" — copy-paste
    error from search_cv_rf.)

    Requires: train and validation data, both features and target in both cases
    '''
    model=GradientBoostingClassifier(random_state=15)
    # grid search - find best parameters
    # NOTE(review): max_features='auto' was deprecated/removed for
    # GradientBoostingClassifier in newer scikit-learn; drop it if upgrading.
    parameters = {'n_estimators':[10,50,100,200],
                  'max_depth':[5,10],
                  'min_samples_split':[3,7],
                  'min_samples_leaf':[1,2],
                  'max_features':['auto','log2',None]}
    clf = GridSearchCV(model, param_grid=parameters)
    grid_search = clf.fit(x_train, y_train)
    # report the best configuration found
    print("Best score: %0.3f" % grid_search.best_score_)
    print(grid_search.best_estimator_)
    print('best params:', clf.best_params_)
    print('-----grid search end------------')
    # re-score the winning estimator with 3-fold CV on each split
    print('on all train set')
    scores = cross_val_score(grid_search.best_estimator_, x_train, y_train, cv=3, scoring='f1')
    print(scores.mean(), scores)
    print('on test set')
    scores = cross_val_score(grid_search.best_estimator_, x_val, y_val, cv=3, scoring='f1')
    print(scores.mean(), scores)
#search_cv_gb(X_train, y_train, X_val, y_val)
Best score: 0.896
GradientBoostingClassifier(max_depth=10, max_features='log2',
min_samples_split=3, n_estimators=50,
random_state=15)
best params: {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 50}
-----grid search end------------
on all train set
0.8832254334479576 [0.88063823 0.89194139 0.87709668]
on test set
0.8642061670774267 [0.85507246 0.8702509 0.86729514]
# Baseline random forest (fixed seed) on the same train/validation split.
clf = RandomForestClassifier(random_state=15)
print('-------MODEL WITH THE SELECTED FEATURES-------')
# 3-fold cross-validated F1 on the training split.
scores = model_selection.cross_val_score(clf, X_train, y_train, cv=3, scoring='f1')
print("Score on train: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std()))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
# `score` is a project helper defined elsewhere (prints micro-F1 + report).
score(y_val,y_pred)
-------MODEL WITH THE SELECTED FEATURES-------
Score on train: 0.88110 (+/- 0.00)
Micro f1 score: 0.8860318152102854
Results on the data set:
precision recall f1-score support
0 0.89 0.90 0.90 2492
1 0.88 0.87 0.87 2097
accuracy 0.89 4589
macro avg 0.89 0.88 0.89 4589
weighted avg 0.89 0.89 0.89 4589
def search_cv_rf(x_train, y_train, x_val, y_val):
    '''Determines the optimal parameters for a random forest classifier for
    the input data, and prints the optimal random forest classifier, its best
    parameters, and the cross-validation scores for both training and
    validation data.

    Requires: train and validation data, both features and target in both cases
    '''
    model = RandomForestClassifier(random_state=15)
    # grid search - find best parameters
    # NOTE(review): max_features='auto' is deprecated/removed in recent
    # scikit-learn releases -- confirm the installed version accepts it.
    parameters = {'n_estimators': [10, 50, 100, 200],
                  'criterion': ['gini', 'entropy'],
                  'max_depth': [5, 10],
                  'min_samples_split': [3, 7],
                  'min_samples_leaf': [1, 2],
                  'max_features': ['auto', 'log2', None]}
    clf = GridSearchCV(model, param_grid=parameters)
    grid_search = clf.fit(x_train, y_train)
    # report the best cross-validated score, estimator and parameters
    print("Best score: %0.3f" % grid_search.best_score_)
    print(grid_search.best_estimator_)
    print('best params:', clf.best_params_)
    print('-----grid search end------------')
    print('on all train set')
    scores = cross_val_score(grid_search.best_estimator_, x_train, y_train, cv=3, scoring='f1')
    print(scores.mean(), scores)
    # NOTE(review): this re-fits via cross-validation on the validation set
    # rather than scoring the already-fitted model on it -- confirm intended.
    print('on test set')
    scores = cross_val_score(grid_search.best_estimator_, x_val, y_val, cv=3, scoring='f1')
    print(scores.mean(), scores)
#search_cv_rf(X_train, y_train, X_val, y_val)
# Random forest re-fitted with the best parameters found by search_cv_rf.
# NOTE(review): max_features='auto' is deprecated/removed in recent
# scikit-learn releases -- confirm the installed version accepts it. Also no
# random_state is set here, unlike the baseline run above.
clf = RandomForestClassifier(criterion= 'entropy', max_depth= 10, max_features= 'auto', min_samples_leaf= 1, min_samples_split= 3, n_estimators= 200)
print('-------MODEL WITH THE SELECTED FEATURES-------')
# 3-fold cross-validated F1 on the training split.
scores = model_selection.cross_val_score(clf, X_train, y_train, cv=3, scoring='f1')
print("Score on train: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std()))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
# `score` is a project helper defined elsewhere (prints micro-F1 + report).
score(y_val,y_pred)
-------MODEL WITH THE SELECTED FEATURES-------
Score on train: 0.88124 (+/- 0.01)
Micro f1 score: 0.8932229243843974
Results on the data set:
precision recall f1-score support
0 0.89 0.91 0.90 2492
1 0.89 0.87 0.88 2097
accuracy 0.89 4589
macro avg 0.89 0.89 0.89 4589
weighted avg 0.89 0.89 0.89 4589
# Baseline multilayer perceptron (fixed seed) on the same split.
clf = MLPClassifier(random_state=15)
print('-------MODEL WITH THE SELECTED FEATURES-------')
# 3-fold cross-validated F1 on the training split.
scores = model_selection.cross_val_score(clf, X_train, y_train, cv=3, scoring='f1')
print("Score on train: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std()))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
# `score` is a project helper defined elsewhere (prints micro-F1 + report).
score(y_val,y_pred)
-------MODEL WITH THE SELECTED FEATURES-------
Score on train: 0.86579 (+/- 0.01)
Micro f1 score: 0.8842885160165613
Results on the data set:
precision recall f1-score support
0 0.88 0.91 0.90 2492
1 0.89 0.85 0.87 2097
accuracy 0.88 4589
macro avg 0.89 0.88 0.88 4589
weighted avg 0.88 0.88 0.88 4589
def search_mlp(x_train, y_train, x_val, y_val):
    '''Determines the optimal parameters for a MLP classifier for the input
    data, and prints the optimal MLP classifier, its best parameters, and the
    cross-validation scores for both training and validation data.

    Requires: train and validation data, both features and target in both cases
    '''
    model = MLPClassifier(random_state=15)
    # grid search over architecture, activation, solver and regularisation
    parameters = {'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)],
                  'activation': ['tanh', 'relu'],
                  'solver': ['sgd', 'adam'],
                  'alpha': [0.0001, 0.05],
                  'learning_rate': ['constant', 'adaptive']}
    clf = GridSearchCV(model, param_grid=parameters)
    grid_search = clf.fit(x_train, y_train)
    # report the best cross-validated score, estimator and parameters
    print("Best score: %0.3f" % grid_search.best_score_)
    print(grid_search.best_estimator_)
    print('best params:', clf.best_params_)
    print('-----grid search end------------')
    print('on all train set')
    scores = cross_val_score(grid_search.best_estimator_, x_train, y_train, cv=3, scoring='f1')
    print(scores.mean(), scores)
    # NOTE(review): this re-fits via cross-validation on the validation set
    # rather than scoring the already-fitted model on it -- confirm intended.
    print('on test set')
    scores = cross_val_score(grid_search.best_estimator_, x_val, y_val, cv=3, scoring='f1')
    print(scores.mean(), scores)
# MLP re-fitted with the best parameters from search_mlp.
# NOTE(review): no random_state here, unlike the baseline MLP above -- results
# may vary between runs.
clf = MLPClassifier(activation= 'tanh', alpha= 0.05, hidden_layer_sizes= (50, 100, 50), learning_rate= 'constant', solver= 'adam')
print('-------MODEL WITH THE SELECTED FEATURES-------')
# 3-fold cross-validated F1 on the training split.
scores = model_selection.cross_val_score(clf, X_train, y_train, cv=3, scoring='f1')
print("Score on train: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std()))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
# `score` is a project helper defined elsewhere (prints micro-F1 + report).
score(y_val,y_pred)
-------MODEL WITH THE SELECTED FEATURES-------
Score on train: 0.87091 (+/- 0.01)
Micro f1 score: 0.8805840052298975
Results on the data set:
precision recall f1-score support
0 0.87 0.91 0.89 2492
1 0.89 0.84 0.87 2097
accuracy 0.88 4589
macro avg 0.88 0.88 0.88 4589
weighted avg 0.88 0.88 0.88 4589
# Baseline k-nearest-neighbours classifier with default hyper-parameters.
clf = KNeighborsClassifier()
print('-------MODEL WITH THE SELECTED FEATURES-------')
# 3-fold cross-validated F1 on the training split.
scores = model_selection.cross_val_score(clf, X_train, y_train, cv=3, scoring='f1')
print("Score on train: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std()))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
# `score` is a project helper defined elsewhere (prints micro-F1 + report).
score(y_val,y_pred)
-------MODEL WITH THE SELECTED FEATURES-------
Score on train: 0.85889 (+/- 0.01)
Micro f1 score: 0.8651122248855961
Results on the data set:
precision recall f1-score support
0 0.88 0.87 0.87 2492
1 0.84 0.86 0.85 2097
accuracy 0.87 4589
macro avg 0.86 0.87 0.86 4589
weighted avg 0.87 0.87 0.87 4589
def transform_data_train(X, age=False):
    '''Cleans, encodes, imputes and scales a raw feature DataFrame.

    Drops the 'Name' identifier and (if present) the 'Churn' target,
    normalises 'Longevity' answers to yes/no, one-hot encodes the
    categoricals, KNN-imputes missing Year_Birth values, optionally replaces
    Year_Birth with Age, and finally min-max scales every column.

    Requires: X -- raw feature DataFrame; age -- if True, replace Year_Birth
    with Age (current year minus birth year).
    Ensures: a fully numeric DataFrame scaled to [0, 1] with X's index.
    '''
    # 'Name' is a free-text identifier, not a predictive feature.
    X_clean = X.drop(['Name'], axis=1)
    # Collapse the two positive spellings ('yes'/'y') into 'yes'.
    X_clean['Longevity'] = ['yes' if (i == 'yes' or i == 'y') else 'no' for i in X_clean['Longevity']]
    # The target must never be part of the feature matrix.
    if 'Churn' in X_clean.columns:
        X_clean.drop(['Churn'], axis=1, inplace=True)
    X_encoded = pd.get_dummies(X_clean, drop_first = True)
    # fill missing values (KNN Imputer for Year of Birth: first variable of
    # numerical variables); k = sqrt(n) heuristic (~125 for this dataset)
    k_imputer = round(np.sqrt(len(X_encoded)),0).astype('int32')
    imputer = KNNImputer(n_neighbors=k_imputer, weights="uniform", metric='nan_euclidean')
    imputer.fit(X_encoded)
    data_knn = pd.DataFrame(imputer.transform(X_encoded))
    # Column 0 is Year_Birth (first column of the encoded frame); round the
    # imputed values back to whole years.
    data_knn[0] = data_knn[0].round(0)
    X_encoded['Year_Birth'] = data_knn[0].values
    if age:
        X_encoded['Age'] = date.today().year - X_encoded['Year_Birth']
        X_encoded = X_encoded.drop('Year_Birth', axis=1)
    # Apply scaling to numerical data.
    # NOTE(review): the imputer and scaler are re-fitted on whatever frame is
    # passed in, so calling this on the test set fits on test statistics --
    # confirm this train/test leakage is acceptable for the competition setup.
    scaler = MinMaxScaler().fit(X_encoded)
    return pd.DataFrame(scaler.transform(X_encoded), columns = X_encoded.columns, index = X_encoded.index)
# Candidate models for the second modelling round.
models ={}
models["DecisionTree"] = DecisionTreeClassifier()
models["LogisticRegression"] = LogisticRegression()
models["ExtraTrees"] = ExtraTreesClassifier()
# Features are every column except the first (presumably Cust_ID -- see the
# submission code below); target is Churn.
X = train_data_original.iloc[:,1:]
y = train_data_original.loc[:,'Churn']
X_train = transform_data_train(X, age=True)
# Sanity check: both sides are the same call, so this always prints set().
print(set(X_train.columns) - set(transform_data_train(X, age=True).columns))
# Apply the same cleaning/encoding pipeline to the test data.
test_data_original = pd.read_csv('test.csv')
X_2 = test_data_original.iloc[:,1:]
X_test = transform_data_train(X_2, age=True)
set()
# Stratified 5-fold CV preserves the churn class ratio in every fold.
cv = StratifiedKFold(n_splits=5)
# NOTE(review): 'log_loss' as a DecisionTree criterion requires a recent
# scikit-learn release -- confirm the installed version supports it.
param_grid = {
"criterion": ["gini", "entropy", "log_loss"], # [320, 340, 360, 380, 400],
"max_depth": [32, None, 10000, 12000],
"max_features": [0.9909267486266218, 0.88],
"min_samples_leaf": [5, 3, 1],
"min_samples_split": [12, 6, 2]# [25, 30, 32, 34, 38, 45]
}
# Exhaustive grid search for the decision tree, scored by accuracy.
searchCV = GridSearchCV(estimator=models["DecisionTree"], scoring='accuracy', cv=cv, param_grid=param_grid, verbose=True)
searchCV.fit(X_train, y)
print('Best index:', searchCV.best_index_)
print('Best score:', searchCV.best_score_)
print('Best params:', searchCV.best_params_)
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best index: 117
Best score: 0.9171850122329841
Best params: {'criterion': 'entropy', 'max_depth': 10000, 'max_features': 0.88, 'min_samples_leaf': 5, 'min_samples_split': 12}
# Manually chosen feature subset for the logistic regression experiment.
selected = ["OnlineBooking", "BarService", "Age", "Comfort", "Staff"]
#To drop ['RewardPoints','ReceptionSchedule', 'Staff', 'OnlineBooking', 'RoomSpace', 'BarService']
# Restrict the train and test matrices to the selected columns.
X_train_selected = X_train[selected]
X_test_selected = X_test[selected]
from scipy.stats import loguniform
#Logistic Regression grid search (the original comment said "Decision Tree")
cv = StratifiedKFold(n_splits=5)
# NOTE(review): not every solver/penalty pair is valid for LogisticRegression
# (e.g. liblinear cannot use 'elasticnet'); invalid combinations error or
# score nan depending on the scikit-learn version -- confirm.
# C values are random draws from a log-uniform distribution, so this grid is
# not reproducible across runs.
param_grid = {
"solver": ['newton-cg', 'lbfgs', 'liblinear'], # [320, 340, 360, 380, 400],
"penalty": ['none', 'l1', 'l2', 'elasticnet'],
"C": list(loguniform.rvs(1e-5, 100, size=30))
}
searchCV = GridSearchCV(estimator=models["LogisticRegression"], scoring='accuracy', cv=cv, param_grid=param_grid, verbose=True)
# WARNING: This could take some time to run.
searchCV.fit(X_train_selected, y)
print('Best index:', searchCV.best_index_)
print('Best score:', searchCV.best_score_)
print('Best params:', searchCV.best_params_)
Fitting 5 folds for each of 360 candidates, totalling 1800 fits
Best index: 149
Best score: 0.749694293722912
Best params: {'C': 0.007376306744870614, 'penalty': 'l1', 'solver': 'liblinear'}
# Stratified 5-fold CV for the ExtraTrees grid search.
cv = StratifiedKFold(n_splits=5)
# NOTE(review): 'log_loss' as a criterion requires a recent scikit-learn
# release -- confirm the installed version supports it.
param_grid = {
"bootstrap": [True, False], # [0, 1, 2, 3, 4],
"criterion": ["gini", "entropy", "log_loss"], # [320, 340, 360, 380, 400],
"max_depth": [32, None, 10000, 12000],
"max_features": [0.9909267486266218, 0.88],
"min_samples_leaf": [5, 3, 1],
"min_samples_split": [12, 6, 2]# [25, 30, 32, 34, 38, 45]
}
searchCV = GridSearchCV(estimator=models["ExtraTrees"], scoring='accuracy', cv=cv, param_grid=param_grid, verbose=True)
# WARNING: This could take some time to run.
searchCV.fit(X_train, y)
print('Best index:', searchCV.best_index_)
print('Best score:', searchCV.best_score_)
print('Best params:', searchCV.best_params_)
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
Best index: 232
Best score: 0.9462441785544439
Best params: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 32, 'max_features': 0.88, 'min_samples_leaf': 1, 'min_samples_split': 6}
# Best ExtraTrees hyper-parameters carried over from the grid search above.
# NOTE(review): max_depth here is 12000, but the printed best params reported
# max_depth=32 -- confirm which run these values came from.
best_so_far = {'bootstrap': False, 'criterion': 'gini', 'max_depth': 12000, 'max_features': 0.88, 'min_samples_leaf': 1, 'min_samples_split': 6}
# Display both column indexes side by side to confirm train/test alignment.
X_train.columns, X_test.columns
(Index(['RewardPoints', 'Comfort', 'ReceptionSchedule', 'FoodDrink', 'Location',
'Wifi', 'Amenities', 'Staff', 'OnlineBooking', 'PriceQuality',
'RoomSpace', 'CheckOut', 'Checkin', 'Cleanliness', 'BarService',
'Longevity_yes', 'TypeTravel_leisure', 'RoomType_single',
'RoomType_suite', 'Age'],
dtype='object'),
Index(['RewardPoints', 'Comfort', 'ReceptionSchedule', 'FoodDrink', 'Location',
'Wifi', 'Amenities', 'Staff', 'OnlineBooking', 'PriceQuality',
'RoomSpace', 'CheckOut', 'Checkin', 'Cleanliness', 'BarService',
'Longevity_yes', 'TypeTravel_leisure', 'RoomType_single',
'RoomType_suite', 'Age'],
dtype='object'))
# Rebuild the training and test matrices with the final pipeline.
X = train_data_original.iloc[:,1:]
y = train_data_original.loc[:,'Churn']
X_train = transform_data_train(X, age=True)
test_data_original = pd.read_csv('test.csv')
X_2 = test_data_original.iloc[:,1:]
X_test = transform_data_train(X_2, age=True)
# NOTE(review): 'test.csv' is re-read several times in this section --
# harmless but redundant.
test_data_original = pd.read_csv('test.csv')
# Fit the delivery model with the best ExtraTrees parameters and predict.
delivery_model_3 = ExtraTreesClassifier(**best_so_far)
delivery_model_3.fit(X_train, y)
yhat = delivery_model_3.predict(X_test)
# Build the submission frame indexed by customer id.
sub = pd.DataFrame(data=yhat, columns=["Churn"])
sub["Cust_ID"] = test_data_original["Cust_ID"]
sub.set_index("Cust_ID", inplace=True)
# Map string labels to 0/1 (predictions are presumably 'churn'/'no churn'
# strings -- confirm against the target encoding used in training).
sub['Churn'] = [1 if (i == 'churn') else 0 for i in sub['Churn']]
# The lines below repeat the matrix construction above (notebook artefact).
X = train_data_original.iloc[:,1:]
y = train_data_original.loc[:,'Churn']
X_train = transform_data_train(X, age=True)
test_data_original = pd.read_csv('test.csv')
X_2 = test_data_original.iloc[:,1:]
X_test = transform_data_train(X_2, age=True)
# auto-sklearn works only on Unix-based systems, so Docker was used to run
# this section (a tutorial is attached).
# Fixed: the original `import (accuracy, ...)` line was a syntax error; these
# metric objects come from autosklearn.metrics.
from autosklearn.metrics import (accuracy,
                                 f1,
                                 roc_auc,
                                 precision,
                                 average_precision,
                                 recall,
                                 log_loss)
from autosklearn.classification import AutoSklearnClassifier
import autosklearn.metrics as skm
# Stratified 5-fold CV as the internal resampling strategy for autoML.
cv = StratifiedKFold(n_splits=5)
# 650 s search budget, keep at most 6 models on disk, build a 4-model
# ensemble, optimise accuracy while also recording the other metrics.
autoML_classifier = AutoSklearnClassifier(time_left_for_this_task=650,
                                          max_models_on_disc=6,
                                          resampling_strategy=cv,
                                          ensemble_size = 4,
                                          metric = skm.accuracy,
                                          scoring_functions=[skm.accuracy, skm.roc_auc, skm.average_precision, skm.f1, skm.precision, skm.recall])
autoML_classifier.fit(X = X_train, y = y)
# Ranked table of all evaluated models, then a summary of the run.
autoML_classifier.leaderboard(detailed = True, ensemble_only=False)
autoML_classifier.sprint_statistics()
# Align the test columns with the columns used in training.
df_test=df_test[df_data.columns]
#the testing dataframe is cropped: only the columns used in
#training are left
# NOTE(review): `clf` is whichever classifier was fitted last in this
# notebook -- confirm it is the intended final model before submitting.
y_final = clf.predict(df_test)
#predicts the target for the test data
df_test['Churn']=y_final.copy()
df_test[['Churn']].to_csv('sub.csv')
#writes the results of the prediction in a csv file